# # install custom packages - for google collab
# !pip install datashader
# !pip install hdbscan
# Environment report: print interpreter and core library versions for reproducibility.
from platform import python_version
print("python {}".format(python_version()))
import pandas as pd
import numpy as np
print("pandas {}".format(pd.__version__))
print("numpy {}".format(np.__version__))
import seaborn as sns; sns.set()
from scipy.spatial import ConvexHull, convex_hull_plot_2d
import matplotlib.pyplot as plt
import holoviews as hv
import holoviews.operation.datashader as hd
import datashader as ds
import datashader.transfer_functions as tf
# default two-color ramp used by datashader shading throughout the notebook
hd.shade.cmap=["lightblue", "darkblue"]
hv.extension('bokeh', 'matplotlib')
# https://datashader.org/getting_started/Interactivity.html
# https://stackoverflow.com/questions/54793910/how-to-make-the-holoviews-show-graph-on-google-colaboratory-notebook
# Make holoviews render inline. The original used the IPython magic
# `%env HV_DOC_HTML=true`, which is a SyntaxError outside a notebook;
# setting the environment variable directly is equivalent and portable.
import os
os.environ['HV_DOC_HTML'] = 'true'
# set option to process raw data, False will read parsed data directly
# NOTE: name keeps the original typo ("PRCESS") because later cells reference it
DATA_OPTION_PRCESS_RAW = False
# set number of rows to work with
DATA_OPTION_NUM_ROWS = 2307  # total row of data - 2307
# DATA_OPTION_NUM_ROWS = None # all rows
# set paths to data files
RAW_DATA_FILE = 'raw_data/competition_dataset.csv'
PARSED_DATA_FILE = 'intermediate_data/competition_dataset_long_{}.csv'.format(DATA_OPTION_NUM_ROWS)
# Parse the raw wide-format CSV (one row per track, every timestep packed into a
# single '; '-separated string) into long format (one row per track/timestep),
# then persist it so later runs can read the parsed file directly.
if DATA_OPTION_PRCESS_RAW:
    # read raw data to process into parsed data
    raw_df = pd.read_csv(RAW_DATA_FILE, header=0, skiprows=0,
                         nrows=DATA_OPTION_NUM_ROWS, delimiter=None)
    parsed_df = raw_df.copy()
    # token list per row: 4 track-level fields, then repeating groups of 6
    # per-timestep fields (lat, lon, speed, lon_acc, lat_acc, time)
    parsed_df['data'] = parsed_df.iloc[:, 0].str.split('; ')
    parsed_df['count'] = parsed_df['data'].str.len()
    # timestep count = (total tokens - 4 header fields - 1 trailing token) / 6
    parsed_df['count'] = (parsed_df['count'] - 4 - 1) / 6
    parsed_df['count'] = parsed_df['count'].astype(int)
    # credit: https://stackoverflow.com/a/59552714
    # duplicate each raw row once per timestep
    spread_ixs = np.repeat(range(len(parsed_df)), parsed_df['count'])
    # .drop(columns='count').reset_index(drop=True)
    parsed_df = parsed_df.iloc[spread_ixs, :]
    parsed_df['track_id'] = parsed_df['data'].str[0].astype(int)
    # 1-based position of each duplicate within its track; used below to
    # offset into the token list
    parsed_df['grouped_row_id'] = parsed_df.groupby(
        'track_id')['track_id'].rank(method='first').astype(int)
    old_col = raw_df.columns.tolist()[0]
    new_cols = old_col.split('; ')
    # build columns
    parsed_df['track_id'] = parsed_df['data'].apply(lambda x: x[0])
    parsed_df['type'] = parsed_df['data'].apply(lambda x: x[1])
    parsed_df['traveled_d'] = parsed_df['data'].apply(lambda x: x[2])
    parsed_df['avg_speed'] = parsed_df['data'].apply(lambda x: x[3])
    # per-timestep fields sit at offsets 4..9 within each 6-token group
    parsed_df['lat'] = parsed_df.apply(
        lambda row: row['data'][4+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['lon'] = parsed_df.apply(
        lambda row: row['data'][5+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['speed'] = parsed_df.apply(
        lambda row: row['data'][6+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['lon_acc'] = parsed_df.apply(
        lambda row: row['data'][7+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['lat_acc'] = parsed_df.apply(
        lambda row: row['data'][8+(row['grouped_row_id']-1)*6], axis=1)
    parsed_df['time'] = parsed_df.apply(
        lambda row: row['data'][9+(row['grouped_row_id']-1)*6], axis=1)
    # clean up columns
    parsed_df = parsed_df.drop(columns=old_col)
    parsed_df = parsed_df.drop(
        columns=['count',
                 'grouped_row_id',
                 'data']
    ).reset_index(drop=True)
    parsed_df = parsed_df.reset_index(drop=False).rename(
        columns={'index': 'record_id'})
    # output to file
    parsed_df.to_csv(PARSED_DATA_FILE, index=False)
    parsed_df.head(5)
else:
    # read parsed data
    parsed_df = pd.read_csv(PARSED_DATA_FILE, header=0,
                            skiprows=0, delimiter=None)
    parsed_df['track_id'] = parsed_df['track_id'].astype(int)
# clean up unnamed index column - perhaps name it as record id?
# calculate orientation
## bearing using acceleration (do not use as it provides inaccurate bearing)
# math-convention angle of the acceleration vector, degrees in (-180, 180]
parsed_df['acc_angle'] = np.arctan2(parsed_df['lat_acc'],
                                    parsed_df['lon_acc']) * 180 / np.pi  # lon = x, lat = y
## approximate bearing using acceleration (do not use as it provides inaccurate bearing)
# rounded to the nearest 10 degrees
parsed_df['appr_acc_angle'] = parsed_df['acc_angle'].round(-1)
# https://stackoverflow.com/questions/1016039/determine-the-general-orientation-of-a-2d-vector
# https://numpy.org/doc/stable/reference/generated/numpy.arctan2.html
# np.arctan2(y, x) * 180 / np.pi
# compute x and y coordinates (web-mercator meters)
# this improves the ease of calculating distances, especially for clustering
from datashader.utils import lnglat_to_meters
parsed_df.loc[:, 'x'], parsed_df.loc[:, 'y'] = lnglat_to_meters(parsed_df.lon, parsed_df.lat)
# calculate bearing based on next position
# shift(-1) within each track pulls the NEXT point's coordinates onto each row
shifted = parsed_df[['track_id', 'x', 'y']].\
    groupby("track_id").\
    shift(-1).\
    rename(columns=lambda x: x+"_lag")
parsed_df = parsed_df.join(shifted)
# https://stackoverflow.com/questions/5058617/bearing-between-two-points
def gb(x1, x2, y1, y2):
    """Compass-style bearing, in degrees [0, 360), of point 1 seen from point 2.

    The vector runs from (x2, y2) toward (x1, y1); the math-convention angle
    is converted so 0 points along +y (north) and angles grow clockwise.
    Works elementwise on scalars or array-likes.
    """
    math_angle = np.degrees(np.arctan2(y1 - y2, x1 - x2))
    # bearing1 = (math_angle + 360) % 360  # plain math-convention angle, unused
    return (90 - math_angle) % 360
# bearing of each point toward its successor within the same track
parsed_df['bearing'] = gb(
    x1=parsed_df['x'],
    x2=parsed_df['x_lag'],
    y1=parsed_df['y'],
    y2=parsed_df['y_lag'])
# impute bearing of first points
parsed_df = parsed_df.sort_values(
    by='record_id', axis=0)  # make sure record is in order
# shift(1) pulls the PREVIOUS row's bearing within each track
shifted = parsed_df[['track_id', 'bearing']].\
    groupby("track_id").\
    shift(1).\
    rename(columns=lambda x: x+"_lead")
parsed_df = parsed_df.join(shifted)
# if bearing is null, take the previous bearing for the track id
parsed_df['bearing'] = np.where(parsed_df['bearing'].isnull(),
                                parsed_df['bearing_lead'], parsed_df['bearing'])
# there should be no more null bearing
parsed_df[parsed_df['bearing'].isnull()]  # [['record_id','count']]
parsed_df.head(10)
len(parsed_df)
# speed vs time for a few vehicles (track ids 101-104)
dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
# FIX: the original line ended with a stray continuation backslash, which
# glued this assignment onto the next statement and made it a SyntaxError.
df = parsed_df[(parsed_df['track_id']>100) & (parsed_df['track_id']<105)]
ax = sns.scatterplot(
    x="time",
    y="speed",
    # hue="track_id",
    marker='x',
    s=0.2,
    data=df)
# lat lon - 25 vehicles
dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df = parsed_df[(parsed_df['track_id']>100) & (parsed_df['track_id']<125)]
ax = sns.scatterplot(
    x="lon",
    y="lat",
    # hue="track_id",
    marker='+',
    s=1,
    data=df)
# lat lon - all vehicles
dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
ax = sns.scatterplot(
    x="lon",
    y="lat",
    #hue="track_id",
    marker='x',
    s=0.2,
    data=parsed_df)
# lat lon - stopped only - speed <1
dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df = parsed_df[parsed_df['speed']<1]
ax = sns.scatterplot(
    x="lon",
    y="lat",
    #hue="track_id",
    marker='x',
    s=0.5,
    data=df)
# lat lon - at a certain time frame with low speed
# (time == 0 is the first recorded frame)
dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
df = parsed_df[(parsed_df['time'] == 0) & (parsed_df['speed'] < 1)]
ax = sns.scatterplot(
    x="lon",
    y="lat",
    # hue="type",
    # style="speed",
    marker='x',
    s=20,
    data=df)
# Datashaded map of all GPS points over a CartoLight basemap.
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
df = parsed_df.copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
# project lon/lat to web-mercator meters so points align with the tile basemap
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
# Same map, but shaded per vehicle type with a manual color legend overlay.
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
# https://datashader.org/getting_started/Interactivity.html
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
from datashader.colors import Sets1to3
df = parsed_df.copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
# one shaded category per vehicle type
plot = hd.datashade(points, aggregator=ds.count_cat('type')).opts(hv.opts(width=750, height=350))
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
# pair each vehicle type with a categorical color for the legend
color_key = [(name,color) for name,color in zip(['Car', 'Medium Vehicle', 'Motorcycle', 'Heavy Vehicle', 'Bus',
                                                 'Taxi'], Sets1to3)]
# dummy single-point overlays, one per type, so bokeh renders a legend
color_points = hv.NdOverlay({n: hv.Points(df.iloc[0:1,:], label=str(n)).opts(style=dict(color=c)) for n,c in color_key})
#tiles.StamenTerrain() *
tiles.CartoLight() * plot * color_points
# Car only
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# restrict to vehicles of type 'Car'
df = parsed_df[parsed_df['type']=='Car'].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
# Buses only
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# FIX: this cell is labeled "Buses only" but filtered on type=='Car'
# (copy-paste leftover from the previous cell); filter on 'Bus' instead.
df = parsed_df[parsed_df['type']=='Bus'].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
# ~ Stationary points only
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# points recorded at exactly zero speed
df = parsed_df[(parsed_df['speed']==0)].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
# ~ moving points only (>0)
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
df = parsed_df[(parsed_df['speed']>0)].copy()
df['track_id'] = df['track_id']
df['type'] = df['type']
df.loc[:, 'x'], df.loc[:, 'y'] = lnglat_to_meters(df.lon,df.lat)
df = df[['x', 'y', 'lon', 'lat', 'track_id', 'time', 'type']]
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
tiles.CartoLight() * hd.datashade(points).opts(hv.opts(width=750, height=350))
# quick sanity-check of the enriched frame
parsed_df.head(5)
parsed_df.describe()
# utility - hdbscan clustering
# https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html
import hdbscan
def cluster_hdbscan(df,
                    parameters=None,
                    feature_names=['x', 'y'],
                    label_name='unnamed_cluster',
                    verbose=True):
    """Cluster rows of `df` with HDBSCAN and append the labels as a column.

    Parameters
    ----------
    df : pandas.DataFrame - input data; not modified (a copy is returned).
    parameters : dict or None - HDBSCAN keyword overrides; any missing key is
        filled from the defaults below. The caller's dict is NOT mutated
        (the original filled defaults into the caller's dict in place).
    feature_names : list of str - columns used as clustering features.
    label_name : str - name of the output label column.
    verbose : bool - print the effective parameters after fitting.

    Returns
    -------
    pandas.DataFrame - copy of `df` with `label_name` cluster ids (-1 = noise).
    """
    df = df.copy()
    default_parameters = {
        'metric': 'euclidean',
        'min_cluster_size': 200,
        'min_samples': None,
        'cluster_selection_epsilon': 7
    }
    # FIX: original tested `parameters == None`; `is None` is the correct
    # identity check. Merge into a fresh dict so the caller's dict is untouched.
    if parameters is None:
        parameters = default_parameters
    else:
        parameters = {**default_parameters, **parameters}
    clusterer = hdbscan.HDBSCAN(
        metric=parameters['metric'],
        min_cluster_size=parameters['min_cluster_size'],
        min_samples=parameters['min_samples'],
        cluster_selection_epsilon=parameters['cluster_selection_epsilon']
    )
    clusterer.fit(df[feature_names])
    df[label_name] = clusterer.labels_
    if verbose:
        print('hdbscan trained on: ' + str(parameters))
    return df
# utility - dbscan clustering
from sklearn.cluster import DBSCAN
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
def cluster_dbscan(df,
                   parameters=None,
                   feature_names=['x', 'y'],
                   label_name='unnamed_cluster',
                   verbose=True):
    """Cluster rows of `df` with sklearn DBSCAN and append labels as a column.

    Parameters
    ----------
    df : pandas.DataFrame - input data; not modified (a copy is returned).
    parameters : dict or None - keys 'min_samples' and
        'cluster_selection_epsilon' (mapped to DBSCAN's `eps`). Missing keys
        fall back to sklearn's defaults. FIX: the original subscripted
        `parameters` unconditionally, so `parameters=None` raised TypeError.
    feature_names : list of str - columns used as clustering features.
    label_name : str - name of the output label column.
    verbose : bool - print the effective parameters after fitting.

    Returns
    -------
    pandas.DataFrame - copy of `df` with `label_name` cluster ids (-1 = noise).
    """
    df = df.copy()
    # defaults mirror sklearn's own DBSCAN defaults (eps=0.5, min_samples=5),
    # keeping behavior identical for callers that pass both keys
    default_parameters = {
        'min_samples': 5,
        'cluster_selection_epsilon': 0.5
    }
    if parameters is None:
        parameters = default_parameters
    else:
        parameters = {**default_parameters, **parameters}
    clusterer = DBSCAN(
        eps=parameters['cluster_selection_epsilon'],
        min_samples=parameters['min_samples'],
    ).fit(df[feature_names])
    df[label_name] = clusterer.labels_
    if verbose:
        print('dbscan trained on: ' + str(parameters))
    return df
# utility - kmeans clustering
# https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html
from sklearn.cluster import KMeans
def cluster_kmeans(df,
                   n_clusters=4,
                   feature_names=['bearing_median'],
                   label_name='unnamed_cluster',
                   verbose=True):
    """Fit KMeans on the given feature columns and attach the labels.

    Returns a copy of `df` with `label_name` holding each row's cluster
    index; the input frame is left untouched. random_state is pinned for
    reproducible runs.
    """
    labeled = df.copy()
    model = KMeans(n_clusters=n_clusters, random_state=0)
    model.fit(labeled[feature_names])
    labeled[label_name] = model.labels_
    if verbose:
        print('kmeans trained on: ' + str(n_clusters) +
              " clusters and " + str(feature_names))
    return labeled
# Clustering roadway segments to identify approach and major road / intersection.
# prep training data
df = parsed_df  # [(parsed_df['speed']<5)].copy() # ~ bottom 75% speeds
# NOTE: df aliases parsed_df here (no copy), so writes below touch parsed_df too
df['record_id'] = df['record_id']
#df['type'] = df['type']
seg_all_df = df[['x', 'y', 'bearing',
                 'record_id']].set_index('record_id')
#seg_all_df = seg_all_df.head(100000)
# rounding is not a good idea
#seg_all_df['x'] = seg_all_df['x'].round(1)
#seg_all_df['y'] = seg_all_df['y'].round(1)
# set count
seg_all_df['count'] = 1
# get count and angle by unique location
seg_all_df = seg_all_df.\
    groupby(['x', 'y']).\
    agg({"count": np.sum, 'bearing': np.median}).\
    reset_index()
# get total and pct of count
seg_all_df['total_count'] = seg_all_df['count'].sum()
seg_all_df['count_pct'] = seg_all_df['count'] / \
    seg_all_df['total_count'] * 100
# save all data for unique points
seg_all_df = seg_all_df.reset_index(
    drop=False).rename(columns={'index': 'u_id'}).set_index('u_id')
### DENSITY REDUCTION ###
# # filter out unique points with fewer than 0.05% of total points
# seg_all_df = seg_all_df[seg_all_df['count_pct'] > 0.05]
# # filter out unique points with fewer than 0.0001% of total points (1 in mil)
# seg_all_df = seg_all_df[seg_all_df['count_pct']>0.0002]
# filter out infreq points (points with less than 10 samples) for training
# this helps reduce data size and introduce breaks in low density areas of the data
seg_train_df = seg_all_df[seg_all_df['count'] > 10]
seg_infre_df = seg_all_df[seg_all_df['count'] <= 10]
# choose features to be trained on - not needed!
# seg_train_df = seg_train_df[['x', 'y', 'count', 'count_pct']]
# seg_train_df = seg_train_df[['x', 'y', 'bearing']]
# seg_train_df = seg_train_df[['x', 'y']]
# full dataset of unique points (all points)
seg_all_df
# training dataset of unique points (only frequent points)
seg_train_df
# infrequent data points excluded from training
seg_infre_df
# visual inspect - lat lon
dims = (10, 6)
fig, ax = plt.subplots(figsize=dims)
ax = sns.scatterplot(
    x='x',
    y='y',
    s=1,
    palette="black",
    # hue="count",
    # style="speed",
    marker='+',
    edgecolors='red',
    data=seg_train_df.copy())
# visual inspect - rasterize lat lon
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# https://datashader.org/getting_started/Interactivity.html
from datashader.colors import Sets1to3
# https://github.com/holoviz/datashader/issues/767
import colorcet as cc
# large categorical palette from three glasbey colormaps
# (NOTE: set() removes duplicates but also makes the order non-deterministic)
long_key = list(set(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))
df = seg_train_df.copy()
#df['seg_cluster'] = df['seg_cluster'].apply(lambda x: 0 if x >=0 else -1)
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
tiles.CartoLight() * hd.datashade(points,
                                  #aggregator=ds.count_cat('seg_cluster'),
                                  #color_key=long_key
                                  ).opts(hv.opts(width=750, height=350))
#hd.dynspread(hd.datashade(points,
#                          aggregator=ds.count_cat('seg_cluster'), d
#                          color_key=Sets1to3).opts(hv.opts(width=750, height=350)), threshold=0.4)
# define subclustering parameters
seg_cluster_parameter = {
    # x clusters of medium size segments
    'metric': 'euclidean',
    'min_cluster_size': 150,
    'min_samples': None,
    'cluster_selection_epsilon': 5
    # # 8 clusters of medium size segments
    # 'metric': 'euclidean',
    # 'min_cluster_size': 200,
    # 'min_samples': None,
    # 'cluster_selection_epsilon': 20
    # # 7 clusters of medium size segments
    # 'metric'='euclidean',
    # 'min_cluster_size'=300,
    # 'min_samples'=None,
    # 'cluster_selection_epsilon'=10
    # # 12 clusters of fine segments
    # 'metric'='euclidean',
    # 'min_cluster_size'=150,
    # 'min_samples'=None,
    # 'cluster_selection_epsilon'=5
}
# run subclustering for lanes (stage 1: HDBSCAN on x/y of frequent unique points)
seg_train_df_1 = cluster_hdbscan(df=seg_train_df,
                                 parameters=seg_cluster_parameter,
                                 feature_names=['x', 'y'],
                                 label_name='seg_cluster')
# number of distinct labels found (includes -1 noise label if present)
len(seg_train_df_1['seg_cluster'].unique())
# visual inspect clusters by facet plot
# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html
g = sns.FacetGrid(seg_train_df_1, col='seg_cluster', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=0.1)#, edgecolor="w")
# note: cluster 3 and 8&9 are of interest, manually merge 8 and 9
### HDBSCAN parameter tuning decisions ###
# https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
# min_samples
# opt A for decision for min_sample for core points
# ~ 400*600m^2 (240,000 m^2 area)
# ~ 1 mil unique points (745,709 if no lone points)
# avg density of points or minimum eligible density should be ~ 5 points
# opt B for decision for min_sample for core points
# net area is ~ (based on rough calculation of roadway areas)
# 50*600 + 30*400 + 4 * 10*400 = 58,000
# ~ 1 mil unique points (745,709 if no lone points)
# avg density of points or minimum eligible density should be ~ 15 points
# option A or B generates way too many clusters, gradually increase min_samples for core points until less clusters are generated
# cluster_selection_epsilon
# 1.5m radius (or 3.0m width) is approx. lane width, use 5m for a typ. 3 lane roadway
# HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
#         gen_min_span_tree=False, leaf_size=40, memory=Memory(cachedir=None),
#         metric='euclidean', min_cluster_size=5, min_samples=None, p=None)
# prepare data for second-stage training: only rows stage 1 left unclustered (-1)
seg_train_df_2 = seg_train_df_1[seg_train_df_1['seg_cluster']==-1]
seg_train_df_2
# 2nd stage dbscan clustering
# with more relax parameters on unclustered point from 1st stage only
# define subclustering parameters
seg_cluster_parameter = {
    # x clusters of medium size segments
    'metric': 'euclidean',
    'min_cluster_size': 75,
    'min_samples': None,
    'cluster_selection_epsilon': 5
}
# run subclustering for lanes
seg_train_df_2 = cluster_hdbscan(df=seg_train_df_2,
                                 parameters=seg_cluster_parameter,
                                 feature_names=['x', 'y'],
                                 label_name='seg_cluster')
# seg_train_df_2[seg_train_df_2['seg_cluster']==-1]
# clustered from stage 1
seg_a = seg_train_df_1[seg_train_df_1['seg_cluster'] != -1].copy()
# clustered from stage 2
seg_b = seg_train_df_2[seg_train_df_2['seg_cluster'] != -1].copy()
prev_max = seg_train_df_1[seg_train_df_1['seg_cluster']
                          != -1]['seg_cluster'].max()
# offset stage-2 labels so they do not collide with stage-1 label numbers
seg_b['seg_cluster'] = seg_b['seg_cluster'] + \
    prev_max + 1  # increment cluster number
# unclustered
seg_c = seg_train_df_2[seg_train_df_2['seg_cluster'] == -1].copy()
# update training data
seg_train_df = pd.concat([seg_a,seg_b,seg_c])
# visual inspect clusters by facet plot
# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html
g = sns.FacetGrid(seg_train_df_2, col='seg_cluster', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=0.1)#, edgecolor="w")
# cluster 6+12 = 18 is of interest
# visual inspect clusters by facet plot
# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html
g = sns.FacetGrid(seg_train_df, col='seg_cluster', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=0.1)#, edgecolor="w")
# selecting only clusters of interests:
# -Cluster 8 + 9 (E-W road),
# -Cluster 3 (N-S road in the NW corner),
# -Cluster 18 (turning lane from South road to E-W road)
# (the x == 'A'/'B'/'C' checks presumably keep this cell idempotent on re-run — verify)
seg_train_df['seg_cluster_combined'] = seg_train_df['seg_cluster'].\
    apply(lambda x: 'A' if ((x == 'A') | (x == 8) | (x == 9))
          else ('B' if ((x == 'B') | (x == 3)) else
                'C' if ((x == 'C') | (x == 18))
                else 'Exclude'
                )
          )
# remove cluster to be excluded, assign combined cluster as final cluster
seg_train_df = seg_train_df[seg_train_df['seg_cluster_combined'].
                            isin(['A', 'B', 'C'])]
seg_train_df['seg_cluster'] = seg_train_df['seg_cluster_combined']
seg_train_df = seg_train_df.drop(columns=['seg_cluster_combined'])
seg_train_df.groupby(['seg_cluster']).count()
# visual inspect clusters by map - color by clusters
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# https://datashader.org/getting_started/Interactivity.html
from datashader.colors import Sets1to3
# https://github.com/holoviz/datashader/issues/767
import colorcet as cc
long_key = list(set(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))
df = seg_train_df#[seg_train_df['seg_cluster']>=0].copy()
# df = seg_train_df[seg_train_df['seg_cluster']==0].copy()
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
# shade pixels by the categorical segment-cluster label
tiles.CartoLight() * hd.datashade(points,
                                  aggregator=ds.count_cat('seg_cluster'),
                                  color_key=Sets1to3).opts(hv.opts(width=750, height=350))
#tiles.CartoLight() * hd.dynspread(hd.datashade(points,
#                                               aggregator=ds.count_cat('seg_cluster'),
#                                               color_key=long_key).opts(hv.opts(width=750, height=350)), threshold=0.4)
# Recover some nearby points that were not classified in the clustering above.
# reassignment part A for unclustered points
# approach #1
# - every point within an existing cluster is used as a core point for cluster reassignment
# - this approach requires a lot more distance computations
# split unique points into unclustered (label -1) and clustered sets
seg_train_df_0 = seg_train_df[seg_train_df['seg_cluster'] == -1].\
    reset_index(drop=False).\
    rename(columns={'u_id': 'u_id'})
seg_train_df_1 = seg_train_df[seg_train_df['seg_cluster'] != -1].\
    reset_index(drop=False).\
    rename(columns={'u_id': 'u_id_clustered'})
seg_train_df_0 = seg_train_df_0.drop(columns=['seg_cluster'])
seg_train_df_1 = seg_train_df_1.\
    rename(columns={'x': 'x_clustered', 'y': 'y_clustered'}).\
    drop(columns=['count', 'bearing', 'total_count', 'count_pct'])
# constant join key used below for an all-pairs cross join
seg_train_df_0['tmp'] = 1
seg_train_df_1['tmp'] = 1
len(seg_train_df_0)
len(seg_train_df_1)
# build intermediate dataframe (cartesian product: every unclustered x every clustered point)
# https://stackoverflow.com/questions/35234012/python-pandas-merge-two-tables-without-keys-multiply-2-dataframes-with-broadc
seg_train_df_reassign_a = pd.merge(seg_train_df_0, seg_train_df_1, on=['tmp']).drop(columns='tmp')
# calculate Euclidean distance
# more resources for more complex examples: https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/
def e_dist(x1, x2, y1, y2):
    """Euclidean distance between (x1, y1) and (x2, y2); vectorized over arrays."""
    dx = x1 - x2
    dy = y1 - y2
    return np.sqrt(dx * dx + dy * dy)
df = seg_train_df_reassign_a
# distance from every unclustered point to every clustered point
df['dist'] = e_dist(
    x1=df['x_clustered'],
    x2=df['x'],
    y1=df['y_clustered'],
    y2=df['y'])
# get minimum distance in each group
# boolean mask marking, per unclustered point (u_id), the row of its nearest clustered point
idx = df.groupby(['u_id'])['dist'].transform(min) == df['dist']
# save results
seg_reassigned_df_a = df.copy()
seg_reassigned_idx_a = idx
# limit on reassigning unclustered points
reassign_dist_limit = 20  # meters
seg_unclustered_df = seg_reassigned_df_a[seg_reassigned_idx_a]
# limit max distance to 20 meters
seg_unclustered_df = seg_unclustered_df[seg_unclustered_df['dist'] < reassign_dist_limit]
seg_unclustered_df = seg_unclustered_df.set_index('u_id')
# keep the same column layout as seg_train_df so the frames can be concatenated
seg_unclustered_df = seg_unclustered_df[list(seg_train_df.columns)]
seg_unclustered_df
len(seg_train_df)
# final training set: originally-clustered points plus reassigned nearby points
seg_train_df_final = pd.concat(
    [seg_train_df[seg_train_df['seg_cluster'] != -1], seg_unclustered_df])
seg_train_df_final
# build convex hull to recapture raw gps points
# https://stackoverflow.com/questions/60194404/how-to-make-a-polygon-shapefile-which-corresponds-to-the-outer-boundary-of-the-g
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.ConvexHull.html
# # scipy convex hull example
# from scipy.spatial import ConvexHull, convex_hull_plot_2d
# import matplotlib.pyplot as plt
# # hull 1
# points = np.random.rand(30, 2)   # 30 random points in 2-D
# hull = ConvexHull(points)
# plt.plot(points[:,0], points[:,1], 'o')
# for simplex in hull.simplices:
#     plt.plot(points[simplex, 0], points[simplex, 1], 'k-')
# # hull 2
# points = np.random.rand(30, 2)   # 30 random points in 2-D
# plt.plot(points[:,0], points[:,1], 'o')
# hull = ConvexHull(points)
# for simplex in hull.simplices:
#     plt.plot(points[simplex, 0], points[simplex, 1], 'k-')
# taking a look at convex hull without unclustered points
# this can be thought of as congested areas
df = seg_train_df[seg_train_df['seg_cluster'] != -1].\
    reset_index(drop=False).\
    rename(columns={'u_id': 'u_id_clustered'})
# build a dictionary of point arrays keyed by cluster label
cluster_pt_dict = {}
for cluster in df['seg_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_cluster'] == cluster][['x', 'y']].to_numpy()
def get_convex_hull_indices(pts_array):
    """Return the subset of rows of `pts_array` that lie on its convex hull."""
    boundary = ConvexHull(pts_array)
    vertex_ids = np.unique(boundary.simplices.flat)
    return pts_array[vertex_ids, :]
# get convex hull (boundary points per cluster)
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])
# plot each cluster's points plus its hull edges
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
# taking a look at convex hull with unclustered points
# this can be viewed as extended congested areas
df = seg_train_df_final.copy()
# build a dictionary of point arrays keyed by cluster label
cluster_pt_dict = {}
for cluster in df['seg_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_cluster'] == cluster][['x', 'y']].to_numpy()
def get_convex_hull_indices(pts_array):
    """Pick out the convex-hull boundary points of a 2-D point array."""
    simplices = ConvexHull(pts_array).simplices
    unique_ix = np.unique(simplices.flat)
    hull_pts = pts_array[unique_ix, :]
    return hull_pts
# get convex hull objects
cluster_hull_objs = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_objs[cluster] = ConvexHull(cluster_pt_dict[cluster])
# get convex hull indice points
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])
# plot each cluster's points plus its hull edges
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
# build a convex hull points df from the entire training set (incl. unclustered points)
cluster_hull_list_df = []
for cluster in list(cluster_hull_dict.keys()):
    label = cluster
    df = pd.DataFrame(cluster_hull_dict[cluster], columns=['x', 'y'])
    df['seg_cluster'] = label
    cluster_hull_list_df.append(df)
cluster_hull_df = pd.concat(cluster_hull_list_df)
cluster_hull_df
# https://stackoverflow.com/questions/16750618/whats-an-efficient-way-to-find-if-a-point-lies-in-the-convex-hull-of-a-point-cl/16898636#16898636
def in_hull(p, hull):
    """
    Test whether each point in `p` falls inside `hull`.

    `p` is an `NxK` array of `N` points in `K` dimensions. `hull` is either a
    scipy.spatial.Delaunay object or an `MxK` array of `M` points in `K`
    dimensions, in which case its Delaunay triangulation is computed here.
    Returns a boolean array of length `N`.
    """
    from scipy.spatial import Delaunay
    tri = hull if isinstance(hull, Delaunay) else Delaunay(hull)
    # find_simplex returns -1 for points outside the triangulation
    return tri.find_simplex(p) >= 0
# iterate over convex hull objects and match points
cluster_hull_objs.keys()
df = parsed_df.copy()
# integer grid ids at centimeter resolution (x/y are in meters)
df['x_id'] = df['x'] * 100
df['x_id'] = df['x_id'].astype(int)
df['y_id'] = df['y'] * 100
df['y_id'] = df['y_id'].astype(int)
# save ids to parsed_df
parsed_df = df.copy()
df['count'] = 1
# get count and angle by unique location
df = df.\
    groupby(['x', 'y', 'x_id', 'y_id']).\
    agg({"count": np.sum, 'bearing': np.median}).\
    rename(columns={'bearing': 'bearing_median'}).\
    reset_index()
all_cluster_cols = []
cluster_keys = list(cluster_hull_dict.keys())
cluster_keys.sort()
# flag unique points falling inside each cluster's hull; a point inside several
# hulls keeps the last matching cluster in sorted key order
for cluster_hull in cluster_keys:
    col_name = "cluster_{}".format(str(cluster_hull))
    all_cluster_cols.append(col_name)
    df[col_name] = in_hull(
        p=df[['x', 'y']].to_numpy(),
        hull=cluster_hull_dict[cluster_hull])
    df.loc[df[col_name]==True, 'seg_cluster'] = str(cluster_hull)
    df = df.drop(columns=[col_name])
# merge id table with name table
# use all points - allow duplicate identicals
# clustered_df = parsed_df.merge(
#     df.drop(columns=['x', 'y']), on=['x_id', 'y_id'])
# use only unique points - disallow identicals
seg_train_df_final = df.copy()
# points outside every hull have NaN here and become the string 'nan'
seg_train_df_final['seg_cluster'] = seg_train_df_final['seg_cluster'].astype(str)
# (Instead of directional clustering, due to the restricted scope of the analysis area.)
# keep a backup so the filtering below can be re-run from scratch
seg_train_df_final_bk = seg_train_df_final.copy()
seg_train_df_final = seg_train_df_final_bk.copy()
# filter out infreq points (points with less than 2 samples) for training
# this helps reduce data size and introduce breaks in low density areas of the data
seg_train_df_final = seg_train_df_final[seg_train_df_final['count'] > 2]
# NOTE(review): computed AFTER the filter above, so this frame is always empty;
# presumably meant to come from the pre-filter frame — verify
seg_train_df_infre = seg_train_df_final[seg_train_df_final['count'] <= 2]
cluster_list = seg_train_df_final['seg_cluster'].unique()
# cluster_list = [1]
cluster_list
# drop points that fell inside no hull ('nan' after the astype(str) cast)
seg_train_df_final = seg_train_df_final[seg_train_df_final['seg_cluster']!='nan']
len(seg_train_df_final)
cluster_list = seg_train_df_final['seg_cluster'].unique()
# cluster_list = [1]
cluster_list
# prepare data: one sub-frame per segment cluster label
seg_train_df_final_dict = dict((key, seg_train_df_final[seg_train_df_final['seg_cluster'] == key])
                               for key in cluster_list)
# # run subclustering for direction - kmeans
# subcluster_parameters = {
# 'A': {
# 'n_clusters': 4
# },
# 'B': {
# 'n_clusters': 1
# },
# 'C': {
# 'n_clusters': 3
# }
# }
# subcluster_results = dict((key,
# cluster_kmeans(df=seg_train_df_final_dict[key],
# n_clusters=subcluster_parameters[key]['n_clusters'],
# feature_names=['bearing_median'],
# label_name='dir_cluster')
# )
# for key in cluster_list)
# # # run subclustering for direction - hdbscan
# # define subclustering parameters
# subcluster_parameters = {
# 'A': {
# 'metric': 'euclidean',
# 'min_cluster_size': 1000,
# 'min_samples': 100,
# 'cluster_selection_epsilon': 1
# },
# 'B': {
# 'metric': 'euclidean',
# 'min_cluster_size': 1000,
# 'min_samples': 100,
# 'cluster_selection_epsilon': 1
# },
# 'C': {
# 'metric': 'euclidean',
# 'min_cluster_size': 1000,
# 'min_samples': 100,
# 'cluster_selection_epsilon': 1
# }
# }
# # run subclustering for lanes
# subcluster_results = dict((key,
# cluster_hdbscan(df=seg_train_df_final_dict[key],
# parameters=subcluster_parameters[key],
# feature_names=['x', 'y'],
# label_name='dir_cluster')
# )
# for key in cluster_list)
# per-segment DBSCAN parameters (eps taken from 'cluster_selection_epsilon')
lane_parameter = {
    'A': {
        'min_samples': 100,
        'cluster_selection_epsilon': 1
    },
    'B': {
        'min_samples': 100,
        'cluster_selection_epsilon': 1
    },
    'C': {
        'min_samples': 50,
        'cluster_selection_epsilon': 1
    }
}
# run DBSCAN within each segment cluster to split it into direction sub-clusters
subcluster_results = dict((key, cluster_dbscan(df=seg_train_df_final_dict[key],
                                               parameters=lane_parameter[key],
                                               feature_names=['x', 'y'],
                                               label_name='dir_cluster',
                                               verbose=False)) for key in cluster_list)
subcluster_results_df = pd.concat(list(subcluster_results.values()))
# # filter out "outliers" within the cluster
# subcluster_results_df = subcluster_results_df[subcluster_results_df['lane_subcluster']!=-1]
# composite label "<segment>_<direction>", e.g. "A_0"
subcluster_results_df['seg_dir_cluster'] = subcluster_results_df['seg_cluster'].astype(
    str) + "_" + subcluster_results_df['dir_cluster'].astype(str)
len(subcluster_results_df)
min_cluster_size = 150  # for seg dir cluster, if not met, cluster is deleted
checksum = subcluster_results_df.groupby(['seg_dir_cluster']).count()
# composite clusters whose row count falls below the threshold
exclude = checksum[checksum<min_cluster_size].dropna().reset_index()
# exclude
subcluster_results_df = subcluster_results_df[~subcluster_results_df['seg_dir_cluster'].isin(
    exclude['seg_dir_cluster'])].copy()
exclude
len(subcluster_results_df)
# taking a look at convex hull with directions
# this can be viewed as extended congested areas
df = subcluster_results_df.copy()
# build a dictionary of point arrays keyed by seg_dir cluster label
cluster_pt_dict = {}
for cluster in df['seg_dir_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_dir_cluster'] == cluster][['x', 'y']].to_numpy()
def get_convex_hull_indices(pts_array):
    """Return the subset of points in `pts_array` (N x 2) lying on its convex hull.

    Points are returned in ascending index order of the input array.
    """
    vertex_ids = np.unique(ConvexHull(pts_array).simplices.flat)
    return pts_array[vertex_ids, :]
# get convex hull objects (one scipy ConvexHull per seg_dir_cluster)
cluster_hull_objs = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_objs[cluster] = ConvexHull(cluster_pt_dict[cluster])
# get convex hull vertex points per cluster
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])
# plot each cluster's points plus its hull edges as black lines
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
# check size of clusters
subcluster_results_df.groupby('seg_dir_cluster').count()
# NOTE(review): `points` is only defined by a later cell in file order -- this
# line relies on notebook execution order and raises NameError in a fresh run
points.array()[0]
# build seg_dir_lane_cluster from visual inspection
# this effectively clean up the clustering result and get rid of clusters that aren't meaningful
# lookup table: raw seg_dir_cluster label -> curated lane label
seg_dir_to_lane = {
    'B_3': 'Green_1', 'B_4': 'Green_1', 'B_5': 'Green_1', 'B_6': 'Green_1',
    'B_0': 'Green_2',
    'B_1': 'Green_3', 'B_2': 'Green_3',
    'C_0': 'Yellow_1',
    'C_1': 'Yellow_2',
    'C_2': 'Yellow_3',
    'A_0': 'Red_1', 'A_3': 'Red_1',
    'A_1': 'Red_2', 'A_7': 'Red_2', 'A_8': 'Red_2', 'A_15': 'Red_2',
    'A_2': 'Red_3', 'A_9': 'Red_3',
}
# any cluster not in the table is not meaningful and gets tagged 'Exclude'
subcluster_results_df['seg_dir_lane_cluster'] = subcluster_results_df['seg_dir_cluster'].\
    apply(lambda label: seg_dir_to_lane.get(label, 'Exclude'))
# drop everything that was not assigned a meaningful lane label
subcluster_results_df = subcluster_results_df[subcluster_results_df['seg_dir_lane_cluster'] != 'Exclude']
# visual inspect clusters by map - color by clusters
# https://datashader.org/user_guide/Geography.html
# https://holoviews.org/reference/elements/bokeh/Tiles.html
## hv.element.tiles.tile_sources
from holoviews.element import tiles
from datashader.utils import lnglat_to_meters
# https://datashader.org/getting_started/Interactivity.html
from datashader.colors import Sets1to3
# https://github.com/holoviz/datashader/issues/767
import colorcet as cc
# large categorical palette so each cluster gets a visually distinct color
long_key = list(set(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))
df = subcluster_results_df  # [seg_train_df['seg_cluster']>=0].copy()
# df = subcluster_results_df[subcluster_results_df['dir_cluster']>=0].copy()
# df = subcluster_results_df[subcluster_results_df['seg_dir_cluster'].isin(['A_0', 'A_1', 'A_2', 'A_3', 'A_4', 'A_5', 'A_6', 'A_7', 'A_8', 'A_9', 'A_10', 'A_11', 'A_12', 'A_13', 'A_14', 'A_15', 'A_16', 'A_17', 'A_18'])].copy()
# df = subcluster_results_df[subcluster_results_df['seg_dir_cluster'].isin(['A_0', 'A_3', 'A_10'])].copy()
# df = seg_train_df[seg_train_df['seg_cluster']==0].copy()
points = hv.Points(df.copy())
hv.extension('bokeh')
hv.output(backend='bokeh')
#tiles.EsriImagery() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrainRetina() * hd.datashade(points).opts(hv.opts(width=750, height=350))
#tiles.StamenTerrain() *
# shaded scatter on a light basemap, colored per lane cluster category
tiles.CartoLight() * hd.datashade(points,
                                  aggregator=ds.count_cat('seg_dir_lane_cluster'),
                                  color_key=long_key).opts(hv.opts(width=750, height=350))  #* color_points
#tiles.CartoLight() * hd.dynspread(hd.datashade(points,
#                                  aggregator=ds.count_cat('seg_cluster'),
#                                  color_key=long_key).opts(hv.opts(width=750, height=350)), threshold=0.4)
# visual inspect clusters by facet plot
# https://seaborn.pydata.org/generated/seaborn.FacetGrid.html
# subcluster_results_df['tmp'] = 1
# one scatter panel per lane cluster, hue keyed to the same column
g = sns.FacetGrid(
    subcluster_results_df,
    hue='seg_dir_lane_cluster',
    col_wrap=5,
    height=4,
    legend_out=True,
    # col='tmp'
    col='seg_dir_lane_cluster'
)
g = g.map(plt.scatter, 'x', 'y', s=0.05, marker='.')  # , edgecolor="w")
# taking a look at convex hull with lane and directions
# this can be viewed as extended congested areas
df = subcluster_results_df.copy()
# build a dictionary of {seg_dir_lane_cluster label -> (N, 2) array of x/y points}
cluster_pt_dict = {}
for cluster in df['seg_dir_lane_cluster'].unique():
    cluster_pt_dict[cluster] = df[
        df['seg_dir_lane_cluster'] == cluster][['x', 'y']].to_numpy()
def get_convex_hull_indices(pts_array):
    """Extract the convex-hull vertex coordinates from an (N, 2) point array."""
    hull = ConvexHull(pts_array)
    on_hull = np.unique(hull.simplices.flat)
    return pts_array[on_hull, :]
# get convex hull objects (one scipy ConvexHull per lane cluster)
cluster_hull_objs = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_objs[cluster] = ConvexHull(cluster_pt_dict[cluster])
# get convex hull vertex points per lane cluster (consumed below by in_hull)
cluster_hull_dict = {}
for cluster in list(cluster_pt_dict.keys()):
    cluster_hull_dict[cluster] = get_convex_hull_indices(
        cluster_pt_dict[cluster])
# plot each lane cluster's points plus its hull edges as black lines
for cluster in list(cluster_pt_dict.keys()):
    plt.plot(cluster_pt_dict[cluster][:, 0],
             cluster_pt_dict[cluster][:, 1], ',')
    hull = ConvexHull(cluster_pt_dict[cluster])
    for simplex in hull.simplices:
        plt.plot(cluster_pt_dict[cluster][simplex, 0],
                 cluster_pt_dict[cluster][simplex, 1], 'k-')
# https://stackoverflow.com/questions/16750618/whats-an-efficient-way-to-find-if-a-point-lies-in-the-convex-hull-of-a-point-cl/16898636#16898636
def in_hull(p, hull):
    """Return a boolean array flagging which points of `p` fall inside `hull`.

    `p` is an (N, K) array of N points in K dimensions. `hull` may be either a
    ready-made scipy.spatial.Delaunay triangulation or an (M, K) point array,
    in which case the triangulation is computed here.
    """
    from scipy.spatial import Delaunay
    triangulation = hull if isinstance(hull, Delaunay) else Delaunay(hull)
    # find_simplex returns -1 for points outside every simplex
    return triangulation.find_simplex(p) >= 0
# iterate over convex hull objects and match points
cluster_hull_objs.keys()
df = parsed_df.copy()
# integer grid ids at 1/100 resolution -- used as a join key so each unique
# (x, y) location only needs to be hull-tested once
df['x_id'] = df['x'] * 100
df['x_id'] = df['x_id'].astype(int)
df['y_id'] = df['y'] * 100
df['y_id'] = df['y_id'].astype(int)
# save ids to parsed_df
parsed_df = df.copy()
df['count'] = 1
# get count and median bearing by unique location
# (string aggregators instead of np.sum/np.median: passing numpy callables to
# DataFrame.agg is deprecated in pandas >= 2.0)
df = df.\
    groupby(['x', 'y', 'x_id', 'y_id']).\
    agg({"count": "sum", 'bearing': "median"}).\
    rename(columns={'bearing': 'bearing_median'}).\
    reset_index()
all_cluster_cols = []
cluster_keys = list(cluster_hull_dict.keys())
cluster_keys.sort()
# test every unique location against each cluster's hull; when hulls overlap,
# the last cluster in sorted order wins because it is assigned last
for cluster_hull in cluster_keys:
    col_name = "cluster_{}".format(str(cluster_hull))
    all_cluster_cols.append(col_name)
    df[col_name] = in_hull(
        p=df[['x', 'y']].to_numpy(),
        hull=cluster_hull_dict[cluster_hull])
    df.loc[df[col_name] == True, 'seg_dir_lane_cluster'] = str(cluster_hull)
    df = df.drop(columns=[col_name])
# merge id table with name table
# use all points - allow duplicate identicals
# clustered_df = parsed_df.merge(
#     df.drop(columns=['x', 'y']), on=['x_id', 'y_id'])
# use only unique points - disallow identicals
subcluster_results_df = df.copy()
subcluster_results_df['seg_dir_lane_cluster'] = subcluster_results_df['seg_dir_lane_cluster'].astype(str)
# remove nan (locations that fell inside no hull)
subcluster_results_df = subcluster_results_df[subcluster_results_df['seg_dir_lane_cluster'] != 'nan']
# Merge seg_dir_lane_cluster with all applicable data points
# (fixed: this heading was a bare markdown line pasted into the code cell,
# which is a SyntaxError when the file is run as a script)
df = subcluster_results_df.copy()
# rebuild the integer grid join keys for the labelled unique locations
df['x_id'] = df['x'] * 100
df['x_id'] = df['x_id'].astype(int)
df['y_id'] = df['y'] * 100
df['y_id'] = df['y_id'].astype(int)
subcluster_results_df = df[['x_id', 'y_id', 'seg_dir_lane_cluster']].copy()
subcluster_results_df
# attach the lane label to every raw record sharing the same grid cell
clustered_df = parsed_df.merge(
    subcluster_results_df, on=['x_id', 'y_id']).\
    sort_values(by='record_id', axis=0)  # make sure record is in order
clustered_df
clustered_df.groupby(['seg_dir_lane_cluster']).count()
########### End of Clustering Models for Roadway Geometries ###########
# for each cluster, and time step
# cluster queues based on DBSCAN, set an avg speed eligibility ~ 10kph (no tailgating) - if a whole set of points are fast but close, ignore
#
# find queue length based on furthest point algorithm function - these points are start and end of queues
#
# define speed threshold (kph): points at or above this are moving, not queued
speed_threshold = 10
import math
# create time bin for the clustered df data
# calculate time bin based on max and min values, then do every x seconds
x_sec_bin = 0.02  # step size - shouldn't be too large, if 0.02, no bin
# NOTE(review): min_time/max_time are computed but currently unused
min_time = min(clustered_df['time'])
max_time = max(clustered_df['time'])
if(x_sec_bin <= 0.02):
    # native resolution -- use the raw timestamps as bins
    clustered_df['time_bin'] = clustered_df['time'].copy()
else:
    # round each timestamp to the nearest x_sec_bin grid point
    clustered_df['time_bin'] = x_sec_bin * \
        np.round(clustered_df['time']/x_sec_bin, 0)
# a quick analysis on the count by time bins
clustered_df['count'] = 1
# slow-point counts per (lane cluster, time bin); string aggregator "sum"
# instead of np.sum (deprecated for DataFrame.agg in pandas >= 2.0)
cluster_time_df = clustered_df[clustered_df['speed'] < speed_threshold].\
    groupby(['seg_dir_lane_cluster', 'time_bin']).\
    agg({'count': "sum"}).\
    reset_index()
# cluster_time_df = cluster_time_df[cluster_time_df['count'] > 1]
# for testing, use whole seconds only
# whole_seconds_only = ~(cluster_time_df['time'].astype(int) < cluster_time_df['time'])
# cluster_time_df = cluster_time_df[whole_seconds_only]
# cluster_time_list = cluster_time_df.\
#     drop(columns=['count']).\
#     to_numpy()
# # len(cluster_time_list)
# cluster_time_df[cluster_time_df['seg_dir_lane_cluster'] == '7_3'] # ['count'].
max(cluster_time_df['count'])
min(cluster_time_df['count'])  # let's get rid of these groups, they won't have any queues
len(clustered_df)
clustered_df_eval = clustered_df.merge(cluster_time_df.rename(
    columns={'count': 'time_bin_count'}), on=['seg_dir_lane_cluster', 'time_bin'])
# exclude cluster and time points with no more than 1 sample
clustered_df_eval = clustered_df_eval[clustered_df_eval['time_bin_count'] > 1]
# exclude points that are moving faster than 10 kph
clustered_df_eval = clustered_df_eval[clustered_df_eval['speed'] < speed_threshold]
clustered_df_eval.groupby(['seg_dir_lane_cluster']).count()
# [x for i, x in df.groupby(level=0, sort=False)]
# one DataFrame per (lane cluster, time bin) group, for per-group DBSCAN below
cluster_df_eval_list = [x for i, x in clustered_df_eval.groupby(['seg_dir_lane_cluster', 'time_bin'], sort=False)]
cluster_df_eval_list[0]['time']
# congestion_parameter = {
#     'metric': 'euclidean',
#     'min_cluster_size': 2,
#     'min_samples': 2,
#     'cluster_selection_epsilon': 1
# }
# df = cluster_hdbscan(df=cluster_df_eval_list[96],
#                      parameters=congestion_parameter,
#                      feature_names=['x', 'y'],
#                      label_name='cong_flag',
#                      verbose=False)
# sanity check: DBSCAN queue detection on a single (cluster, time bin) group
congestion_parameter = {
    # at least 2 points close together are required to form a queue
    'min_samples': 2,
    'cluster_selection_epsilon': 20
}
df = cluster_dbscan(df=cluster_df_eval_list[96],
                    parameters=congestion_parameter,
                    feature_names=['x', 'y'],
                    label_name='cong_flag')
df
# one scatter panel per detected queue (cong_flag value)
g = sns.FacetGrid(df, col='cong_flag', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=10, marker='.')  # , edgecolor="w")
# run clustering for congestion for lanes
# congestion_parameter = {
#     'metric': 'euclidean',
#     'min_cluster_size': 2,
#     'min_samples': 2,
#     'cluster_selection_epsilon': 1
# }
# cong_cluster_df_eval_list = [(cluster_hdbscan(df=df,
#                               parameters=congestion_parameter,
#                               feature_names=['x', 'y'],
#                               label_name='cong_flag',
#                               verbose=False)
#                              )
#                              for df in cluster_df_eval_list]
congestion_parameter = {
    # at least 2 points close together are required to form a queue
    'min_samples': 2,
    'cluster_selection_epsilon': 20
}
# apply DBSCAN within every (lane cluster, time bin) group
cong_cluster_df_eval_list = [(cluster_dbscan(df=df,
                                             parameters=congestion_parameter,
                                             feature_names=['x', 'y'],
                                             label_name='cong_flag',
                                             verbose=False)
                              )
                             for df in cluster_df_eval_list]
# visual checks
g = sns.FacetGrid(cong_cluster_df_eval_list[60], col='cong_flag', col_wrap=5, height=4)
g = g.map(plt.scatter, 'x', 'y', s=10, marker='.')  # , edgecolor="w")
# combine results from clustering
cong_cluster_df_result = pd.concat(cong_cluster_df_eval_list)
# remove all outliers (not dense enough to qualify as queues)
cong_cluster_df_result = cong_cluster_df_result[cong_cluster_df_result['cong_flag'] != -1]
cong_cluster_df_result.groupby(['seg_dir_lane_cluster']).count()
cong_cluster_df_result
# build intermediate dataframe
# https://stackoverflow.com/questions/35234012/python-pandas-merge-two-tables-without-keys-multiply-2-dataframes-with-broadc
# calculate Euclidean distance
# more resources for more complex examples: https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/
def e_dist(x1, x2, y1, y2):
    """Element-wise Euclidean distance between points (x1, y1) and (x2, y2)."""
    return np.sqrt((x1-x2) ** 2+(y1-y2) ** 2)
def getQueue(df):
    """Find the two farthest-apart points in one queue cluster.

    This function requires the dataframe input to be grouped into appropriate
    clusters (one queue of vehicles per call). A self-cross-join on a constant
    `tmp` column enumerates every point pair; the pair with the maximum
    Euclidean distance marks the start and end of the queue.

    Returns a Series with `_1`/`_2` suffixed columns plus `dist` (the queue
    length), or None for an empty input frame.
    """
    df1 = df.copy()
    df1['tmp'] = 1
    if len(df1) > 0:
        # cross join: every point paired with every point (O(n^2) rows)
        df_dist = pd.merge(df1, df1, on=['tmp'], suffixes=(
            '_1', '_2'))
        df_dist['dist'] = e_dist(
            x1=df_dist['x_1'],
            x2=df_dist['x_2'],
            y1=df_dist['y_1'],
            y2=df_dist['y_2'])
        # get maximum distance in the group
        # idx = df_dist.groupby(['cong_flag_1'])['dist'].transform(max) == df_dist['dist']
        idx = df_dist['dist'].max() == df_dist['dist']
        # keeping the first is good enough - idx matches each extreme pair twice
        result = df_dist[idx].iloc[0]
        return result
    # explicit: an empty group has no queue (was an implicit fall-through)
    return None
# queue_calc_eval_list = [x for i, x in cong_cluster_df_result.groupby(
#     ['seg_dir_lane_cluster', 'time_bin', 'cong_flag'], sort=False)]
# # test
# getQueue(queue_calc_eval_list[0])
# one result row per (lane cluster, time bin, queue): the farthest point pair
ct_queue_calc_result = cong_cluster_df_result.groupby(
    ['seg_dir_lane_cluster', 'time_bin', 'cong_flag']).apply(lambda grp: getQueue(grp))
ct_queue_calc_result_final = ct_queue_calc_result.\
    reset_index()\
    [['seg_dir_lane_cluster', 'time_bin', 'cong_flag', 'record_id_1',
      'record_id_2', 'lat_1', 'lon_1', 'lat_2', 'lon_2', 'dist']]
# each row is a recorded queue
# where
#  seg_dir_lane_cluster is the road segment direction and lane cluster group
#  time_bin is the time stamp
#  cong_flag is the queue number
#  record_id_1 record id of the track and time of the start of the queue
#  lat_1 is the latitude of start of the queue
#  lon_1 is the longitude of start of the queue
#  record_id_2 record id of the track and time of the end of the queue
#  lat_2 is the latitude of end of the queue
#  lon_2 is the longitude of end of the queue
#  dist is the queue length
# save all the queues
ct_queue_calc_result_final.to_csv('uas4t_tl_team_queue-revised.csv', index=False)
# for each cluster, find the max queue length
# then report
## i. Maximum length of queue,
## ii. Lane the maximum length occurred,
## iii. Coordinates of the start and end of the maximum queue,
## iv. Timestamp of the maximum queue occurrence, and v. whether, when and where a spillback is formed (when applicable).
ct_queue_calc_result_final
# max queue by cluster; string aggregator "max" instead of np.max (passing
# numpy callables to DataFrame.agg is deprecated in pandas >= 2.0)
max_dist = ct_queue_calc_result_final.\
    groupby(['seg_dir_lane_cluster']).\
    agg({'dist': "max"}).\
    rename(columns={'dist': 'max_queue_length'}).\
    reset_index()
# keep only the row(s) whose dist equals the cluster-wide maximum
max_queue_df = ct_queue_calc_result_final.merge(max_dist, on='seg_dir_lane_cluster')
max_queue_df = max_queue_df[max_queue_df['max_queue_length'] == max_queue_df['dist']]
max_queue_df.to_csv('uas4t_tl_team_results-revised.csv', index=False)
list(max_queue_df.columns)
list(max_queue_df.columns)
# for each row is a recorded max queue per cluster over one or more time interval
# where
# seg_dir_lane_cluster is the road segment direction cluster group
# time_bin is the time stamp
# cong_flag is the queue number
# record_id_1 record id of the track and time of the start of the queue
# lat_1 is the latitude of start of the queue
# lon_1 is the longitude of start of the queue
# record_id_2 record id of the track and time of the end of the queue
# lat_2 is the latitude of end of the queue
# lon_2 is the longitude of end of the queue
# max_queue_length is the maximum queue length (equals dist, aka queue length)